package com.chenlb.mmseg4j.example;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Example driver: invokes the {@code Simple} and {@code Complex} runners and then
 * segments the paragraphs of a hard-coded HTML snippet.
 */
public class Mmseg4jRun {

    /**
     * Matches a single ASCII or full-width punctuation character from a fixed set, or a
     * newline, carriage return or tab. Compiled once because the expression never changes.
     */
    private static final Pattern PUNCTUATION_AND_CONTROL = Pattern.compile(
            "[`~!@#$%^&*()+=|{},':;'\\[\\].<>/?~！@#￥%……&*（）——+|{}【】‘；：”“’。，、？]|\n|\r|\t");

    /**
     * Runs the three demos in order: Simple, Complex, then the HTML segmentation demo.
     *
     * @param args ignored
     * @throws IOException if any of the demos fails with an I/O error
     */
    public static void main(String[] args) throws IOException {
        runSimple();
        runComplex();
        runMaxWord();
    }

    /**
     * Creates a {@code Simple} instance and calls its {@code run()} method.
     *
     * @throws IOException propagated from {@code Simple.run()}
     */
    public static void runSimple() throws IOException {
        new Simple().run();
    }

    /**
     * Creates a {@code Complex} instance and calls its {@code run()} method.
     *
     * @throws IOException propagated from {@code Complex.run()}
     */
    public static void runComplex() throws IOException {
        new Complex().run();
    }

    /**
     * Parses a hard-coded HTML fragment, prints a punctuation-stripped sample string, and
     * prints the segmentation of each {@code <p>} element's text using "/" as the separator.
     *
     * <p>NOTE(review): despite the method name, segmentation goes through {@code Complex};
     * the original code had the {@code MaxWord} runner call disabled. Confirm which is intended.
     *
     * @throws IOException propagated from {@code Complex.segWords}
     */
    private static void runMaxWord() throws IOException {
        String dataContent = "<p class=\"MsoNormal\" style=\"text-indent: 24.0pt; mso-char-indent-count: 2.0; line-height: 150%;\"><span style=\"font-size: 12.0pt; mso-bidi-font-size: 11.0pt; line-height: 150%; font-family: 宋体; mso-no-proof: yes;\">金银菜排骨汤</span></p>↵<p class=\"MsoNormal\" style=\"text-indent: 24.0pt; mso-char-indent-count: 2.0; line-height: 150%;\"><span lang=\"EN-US\" style=\"font-size: 12.0pt; mso-bidi-font-size: 11.0pt; line-height: 150%; font-family: 宋体; mso-no-proof: yes;\">[</span><span style=\"font-size: 12.0pt; mso-bidi-font-size: 11.0pt; line-height: 150%; font-family: 宋体; mso-no-proof: yes;\">组成<span lang=\"EN-US\">]</span>菜干<span lang=\"EN-US\">200</span>克，新鲜白菜<span lang=\"EN-US\">200</span>克，排骨<span lang=\"EN-US\">500</span>克，陈皮<span lang=\"EN-US\">10</span>克，调味适量。</span></p>↵<p class=\"MsoNormal\" style=\"text-indent: 24.0pt; mso-char-indent-count: 2.0; line-height: 150%;\"><span lang=\"EN-US\" style=\"font-size: 12.0pt; mso-bidi-font-size: 11.0pt; line-height: 150%; font-family: 宋体; mso-no-proof: yes;\">[</span><span style=\"font-size: 12.0pt; mso-bidi-font-size: 11.0pt; line-height: 150%; font-family: 宋体; mso-no-proof: yes;\">用法<span lang=\"EN-US\">]</span>先将陈皮放入适量的清水一同煲滚，然后把各材料一齐放进已煲好的滚水中煲十分钟后，改用慢火煲约三小时，加盐调味，即可。</span></p>↵<p class=\"MsoNormal\" style=\"text-indent: 24.0pt; mso-char-indent-count: 2.0; line-height: 150%;\"><span lang=\"EN-US\" style=\"font-size: 12.0pt; mso-bidi-font-size: 11.0pt; line-height: 150%; font-family: 宋体; mso-no-proof: yes;\">[</span><span style=\"font-size: 12.0pt; mso-bidi-font-size: 11.0pt; line-height: 150%; font-family: 宋体; mso-no-proof: yes;\">功效<span lang=\"EN-US\">]</span>行气化痰。</span></p>";
        Document doc = Jsoup.parse(dataContent);
        Elements paragraphs = doc.select("p");
        System.out.println("====================");

        // Small demonstration of the punctuation filter on a mixed CJK/ASCII sample.
        String str = "中文(123441)数智das:! $%#*(*(,.'。，；";
        String newStr = replaceExprSpecialWordRegExp(str);
        System.out.println(newStr);

        // One segmenter instance is reused for every paragraph.
        Complex complex = new Complex();
        for (Element p : paragraphs) {
            String txtContent = p.text();
            String dataWords = complex.segWords(txtContent, "/");
            System.out.println(dataWords);
        }
    }

    /**
     * Removes punctuation and line-control characters from {@code str}.
     *
     * <p>Every character matched by {@link #PUNCTUATION_AND_CONTROL} is deleted: a fixed set
     * of ASCII and full-width punctuation marks plus newline, carriage return and tab.
     * Characters outside that set, such as letters, digits, CJK ideographs and the space
     * character, are kept. For example {@code "das:! $%#*(*("} becomes {@code "das "}.
     *
     * <p>The method has no side effects; it only returns the filtered text.
     *
     * @param str the text to filter; must not be {@code null}
     * @return {@code str} with all matched characters removed
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String replaceExprSpecialWordRegExp(String str) {
        return PUNCTUATION_AND_CONTROL.matcher(str).replaceAll("");
    }
}
